##### ####################################################
#####                                               ######
#####                 Topic analysis            
#####                                               ######
##### ####################################################

# init ------------------------------------------------------------

rm(list=ls())
set.seed(221186)

# Load libraries

library(data.table) # 1.11.4
library(stargazer) # 5.2.2
library(stm) # 1.3.3

source("utils.R")

get.coefficients <- function(mod,coefficent.number){
	mod.summary <- summary(mod)
	coefficient <- coef(mod.summary)[coefficent.number,1]
	se <- coef(mod.summary)[coefficent.number,2]	
	upper <- coefficient + 1.96 * se
	lower <- coefficient - 1.96 * se
	out.vec <- c(coefficient,se,upper,lower)
	names(out.vec) <- c("Coef","SE","Hi","Lo")
	return(out.vec)
}

## ############################
## Load data
## ############################

load("data/speeches.Rdata")

## Get topic files
mod.outs <- list.files("working/topics/mod.out")
mod.outs <- mod.outs[order(as.numeric(gsub(".Rdata","",gsub("all_","",mod.outs))))]

second.stage <- list()

## Loop over topic counts

for(mod in mod.outs){

	n.topics <- gsub(".Rdata","",mod)
	
	## Load topic model output
  
	load(paste0("working/topics/prep.Rdata"))
  load(paste0("working/topics/mod.out/",n.topics,".Rdata"))

  ## Bind meta data with topics
  topics <- data.table(prep$meta, mod.out$theta)


  ## Find topic names

  topic.names <- apply(labelTopics(mod.out)$frex,1,function(x)paste(x,collapse="_"))
  topic.names <- gsub("-","_",topic.names)
  
  setnames(topics,names(topics)[grep("V",names(topics))],topic.names)

  ## Merge topics with speech data
  
  setkey(topics, ministry, subsection_id)
  setkey(speeches, debate_department, subsection_id)

  final.topics <- merge(topics, speeches)

  ## Find mean topic score for each debate (should be the same as unique because topics were calculated at the debate level)
  
  topic.out <- final.topics[,lapply(.SD,function(x) mean(x,na.rm=T)),.SDcols=c(topic.names), by = list(subsection_id, ministry)]

  ## Calculate female participation measures
  
  out <- final.topics[,list(
			prop.womeword_count=sum(word_count[Gender=="F"],na.rm=T)/sum(word_count),
			prop.women.speeches=length(body[Gender=="F"])/length(body),
			ratio.womeword_count=(sum(word_count[Gender=="F"],na.rm=T)/sum(word_count))/unique(prop_women),
			ratio.women.speeches=(length(body[Gender=="F"])/length(body))/unique(prop_women),
			minister.gender=unique(minister_gender),
			gov=unique(gov),
			yearmon=unique(yearmon)),by=list(subsection_id, debate_department)]


  ## Merge it all together
  
  setkey(out, debate_department, subsection_id)
  setkey(topic.out, ministry, subsection_id)
  setnames(topic.out, "ministry", "debate_department")

  out <- merge(out, topic.out)

  ## First stage - what is the baseline of how much women speak on each topic?

  ## Train linear model on male ministers data
  
  male.model <- lm(as.formula(paste("ratio.womeword_count ~ -1 +",paste(topic.names,collapse="+"))), data = out[out$minister.gender=="M"])

  ## Predict fitted values for all debates
  
  out$fitted <- predict(male.model, out)

  out$minister.gender <- as.factor(out$minister.gender)

  out$minister.gender <- factor(out$minister.gender, levels = c("M","F"))
  
  ## Second stage - does the expected female participation increase when a woman takes office?

  ## Retreive beta coefficients
  
  baseline.coef <- data.table(names(coef(male.model)),coef(male.model))
  setnames(baseline.coef,c("V1","V2"),c("topic","female.friendly"))

  baseline.coef <- baseline.coef[order(-baseline.coef$female.friendly),]
  baseline.coef$topic <- reorder(baseline.coef$topic,baseline.coef$female.friendly)
  names(baseline.coef) <- c("Topic","beta")


  ## Create table, ordered by beta coefficients
  sink(paste0("latex/tables/topics/female.friendly_",n.topics,".tex"))
  stargazer((baseline.coef),summary=F,rownames=F,label="female_friendly",title="Topics ordered by their association with female participation under male ministers")
  sink()

  ## For each topic, does the use of that topic increase when the minister is a woman?

  coef.out.change <- data.table(data.frame(t(sapply(1:length(topic.names),function(x) t(c(topic=topic.names[x],get.coefficients(lm(as.formula(paste(topic.names[x],"~ minister.gender + debate_department")),data=out),2)))))))
  
  setnames(coef.out.change,names(coef.out.change),c("Topic","Coef","SE","Hi","Lo"))
  setkey(baseline.coef,Topic)
  setkey(coef.out.change,Topic)

  coef.out.change <- merge(baseline.coef,coef.out.change)
  coef.out.change$Coef <- as.numeric(as.character(coef.out.change$Coef))
  cex.size <- 1.4

## Plot first stage coefficient against second stage coefficient
plot(coef.out.change$beta,coef.out.change$Coef,col="grey",xlab="b",ylab=expression(gamma),bty="n",cex.lab=cex.size,cex.main=cex.size,cex.axis=cex.size, main= paste("K =",gsub("all_","",n.topics)))
abline(lm(Coef ~ beta,data=coef.out.change),lty=2,lwd=2)

count <- gsub("all_","",n.topics)

## Store third stage coefficient
second.stage[[count]] <- lm(Coef ~ beta,data=coef.out.change)


}

## Plot third stage coefficients

combined.coefficients <- data.frame(Topic=as.numeric(names(second.stage)),do.call("rbind",lapply(second.stage,function(x) get.coefficients(x,2))))

combined.coefficients <- combined.coefficients[combined.coefficients$Topic%in%c(30:70),]


png("plots/third_stage_coefficients.png",800,800)
cex.size <- 1.2
par(mfrow=c(1,1),cex=cex.size, mar=c(5,5,4,2))
lims <- range(c(combined.coefficients$Lo-0.01,combined.coefficients$Hi+0.01))
plot(combined.coefficients$Topic,combined.coefficients$Coef,ylim=lims,bty="n",pch=19, xlab="Number of topics",ylab=expression(zeta[1]), cex.axis=cex.size, cex.lab=cex.size, xaxt = "n")
axis(1, at = combined.coefficients$Topic)
segments(combined.coefficients$Topic,y0=combined.coefficients$Lo,x1=combined.coefficients$Topic,y1=combined.coefficients$Hi,lwd= cex.size*1.5)
abline(h=0, lty=2, lwd= cex.size*1.5)
abline(v = seq(20,100,5), lty = 3, lwd= cex.size)
dev.off()



